#loading the packages
library(readr)
library(dplyr)
library(stringr)

In this document I will describe the data frames I used in my analyses, with a particular attention to where I found them.

- get started analysis

#loading the data frame
data_1_FL_getstarted <- read_csv("data_1_FL_getstarted.csv")

The “data_1-FL_getstarted.csv” was retrieved from the EODatascape, with the following specifications:

State:
Florida

County:
All

Boundary Type:
Census Tract

Fields:
From > Economic Profile Data
> ACS - Median Household Income last 12 months (in 2022 Inflation-Adjusted Dollars)
Estimate
> By Race of Householder (the 9 estimates)
> By Age of Householder (the 4 estimates)
> ACS - Median Earnings (Dollars) in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)
Estimate
> By Sex (the 2 estimates)
for a total of 17 variables

From > Housing Data
> ACS - Median Monthly Housing Cost (Occupied Housing Units)
Estimate
> By ownership status (the 2 estimates)
for a total of 3 variables

This yields a data frame with these columns:

data_1_FL_getstarted %>%
  colnames(.)
##  [1] "geoid"                   "geoid_year"             
##  [3] "state"                   "county"                 
##  [5] "state_fips_code"         "county_fips_code"       
##  [7] "med_hh_inc_est"          "med_inc_white_est"      
##  [9] "med_inc_black_est"       "med_inc_ai_an_est"      
## [11] "med_inc_asian_est"       "med_inc_nhopi_est"      
## [13] "med_inc_other_race_est"  "med_inc_2plus_races_est"
## [15] "med_inc_hisp_est"        "med_inc_white_nh_est"   
## [17] "med_inc_age_15_24_est"   "med_inc_age_25_44_est"  
## [19] "med_inc_age_45_64_est"   "med_inc_age_65plus_est" 
## [21] "earn_med_est"            "earn_med_male_est"      
## [23] "earn_med_fem_est"        "housecost_med_est"      
## [25] "housecost_owner_est"     "housecost_rent_est"

that has 5160 rows, for a total of 5160 census tracts, with this distribution of census tracts per county:

data_1_FL_getstarted %>%
  count(county)

There are many negative values in this data frame,

data_1_FL_getstarted %>%
#preserving the rows with negative values
  filter(if_any(everything(), ~ .x < 0)) %>%
#preserving the columns with negative values
  select(where(~ any(.x < 0)))

and in fact they represent jam values, i.e. placeholders for where an actual value is not present (for example if a certain ethnicity does not reside in a specific census tract).

There are no missing values,

#obtaining the number of columns with missing values
rowSums(data_1_FL_getstarted %>%
          summarise(across(everything(), ~ any(is.na(.x)))))
## [1] 0

or duplicated rows.

#checking if the number of distinct and original rows are the same
nrow(data_1_FL_getstarted %>%
       distinct()) == nrow(data_1_FL_getstarted)
## [1] TRUE

- take it further analyses

- part 1

For part 1 I pulled the following tables from the Census Reporter website:

  • B25001, Housing Units Count
  • B25003, Tenure
  • B25002, Occupancy Status
  • B25004, Vacancy Status
  • B25056, Contract Rent
  • B25063, Gross Rent
  • B25070, Gross Rent as a Percentage of Household Income

For all of them the place or summary level chosen was Florida and the data was divided into census tracts. When given the option, the data was downloaded as totals (and not percentages).

#loading the data frames
data_1_FL_housing_units_count <- read_csv("data_1_FL_housing_units_count.csv") #(B25001)
data_1_FL_tenure <- read_csv("data_1_FL_tenure.csv") #(B25003)
data_1_FL_occupancy <- read_csv("data_1_FL_occupancy.csv") #(B25002)
data_1_FL_vacancy_status <- read_csv("data_1_FL_vacancy_status.csv") #(B25004)
data_1_FL_contract_rent <- read_csv("data_1_FL_contract_rent.csv") #(B25056)
data_1_FL_gross_rent <- read_csv("data_1_FL_gross_rent.csv") #(B25063)
data_1_FL_gross_rent_perc <- read_csv("data_1_FL_gross_rent_perc.csv") #(B25070)

I then merged them all together with the following procedure:

#we save the merged data into data_1_FL_takeitfurther_part1
(data_1_FL_takeitfurther_part1 <- data_1_FL_housing_units_count %>%
#we remove the first line, the total for Florida
   slice(-1) %>%
#we clean the geoid column and create the County one
   mutate(geoid = str_sub(geoid, 8),
          County = str_extract(name,"(?<=,)[^,]+(?=,)"),
          County = str_trim(str_c(County, "County", sep = " "))) %>%
#we change name to geoid, select County and change name for the Housing Units count 
   select("Census Tract" = geoid,
          last_col(),
          "Housing Units Count" = B25001001) %>%
#we merge vertically through rows' indexes
   bind_cols(data_1_FL_occupancy %>%
#we remove the first line, the total for Florida               
               slice(-1) %>%
#we select and change names for the Occupied and Vacant counts
               select(Occupied = 5,
                      Vacant = 7)) %>%
#we merge vertically through rows' indexes  
   bind_cols(data_1_FL_tenure %>%
#we remove the first line, the total for Florida               
               slice(-1) %>%
#we select and change names for the Owner and Renter occupied counts
               select("Owner occupied" = 5,
                      "Renter occupied" = 7)) %>%
#we merge vertically through rows' indexes   
   bind_cols(data_1_FL_vacancy_status %>%
#we remove the first line, the total for Florida               
               slice(-1) %>%
#we select and change names for the counts of interest
               select("For rent" = 5,
                      "Rented, not occupied" = 7,
                      "For sale only" = 9,
                      "Sold, not occupied" = 11,
                      "For seasonal, recreational, or occasional use" = 13,
                      "For migrant workers" = 15,
                      "Other Vacant" = 17)) %>%
#we move the Vacant column after Renter Occupied
   relocate(Vacant, .after = `Renter occupied`) %>%
#we merge vertically through rows' indexes   
   bind_cols(data_1_FL_contract_rent %>%
#we remove the first line, the total for Florida               
               slice(-1) %>%
#we select and change names for the counts of interest
               select("$100 or less (contract)" = 7,
                      "$100 to $149 (contract)" = 9,
                      "$150 to $199 (contract)" = 11,
                      "$200 to $249 (contract)" = 13,
                      "$250 to $299 (contract)" = 15,
                      "$300 to $349 (contract)" = 17,
                      "$350 to $399 (contract)" = 19,
                      "$400 to $449 (contract)" = 21,
                      "$450 to $499 (contract)" = 23,
                      "$500 to $549 (contract)" = 25,
                      "$550 to $599 (contract)" = 27,
                      "$600 to $649 (contract)" = 29,
                      "$650 to $699 (contract)" = 31,
                      "$700 to $749 (contract)" = 33,
                      "$750 to $799 (contract)" = 35,
                      "$800 to $899 (contract)" = 37,
                      "$900 to $999 (contract)" = 39,
                      "$1,000 to $1,249 (contract)" = 41,
                      "$1,250 to $1,499 (contract)" = 43,
                      "$1,500 to $1,999 (contract)" = 45,
                      "$2,000 to $2,499 (contract)" = 47,
                      "$2,500 to $2,999 (contract)" = 49,
                      "$3,000 to $3,499 (contract)" = 51,
                      "$3,500 or more (contract)" = 53,
                      "No cash rent (contract)" = 55)) %>%
#we merge vertically through rows' indexes   
   bind_cols(data_1_FL_gross_rent %>%
#we remove the first line, the total for Florida               
               slice(-1) %>%
#we select and change names for the counts of interest
               select("$100 or less (gross)" = 7,
                      "$100 to $149 (gross)" = 9,
                      "$150 to $199 (gross)" = 11,
                      "$200 to $249 (gross)" = 13,
                      "$250 to $299 (gross)" = 15,
                      "$300 to $349 (gross)" = 17,
                      "$350 to $399 (gross)" = 19,
                      "$400 to $449 (gross)" = 21,
                      "$450 to $499 (gross)" = 23,
                      "$500 to $549 (gross)" = 25,
                      "$550 to $599 (gross)" = 27,
                      "$600 to $649 (gross)" = 29,
                      "$650 to $699 (gross)" = 31,
                      "$700 to $749 (gross)" = 33,
                      "$750 to $799 (gross)" = 35,
                      "$800 to $899 (gross)" = 37,
                      "$900 to $999 (gross)" = 39,
                      "$1,000 to $1,249 (gross)" = 41,
                      "$1,250 to $1,499 (gross)" = 43,
                      "$1,500 to $1,999 (gross)" = 45,
                      "$2,000 to $2,499 (gross)" = 47,
                      "$2,500 to $2,999 (gross)" = 49,
                      "$3,000 to $3,499 (gross)" = 51,
                      "$3,500 or more (gross)" = 53,
                      "No cash gross rent (gross)" = 55)) %>%
#we merge vertically through rows' indexes   
   bind_cols(data_1_FL_gross_rent_perc %>%
#we remove the first line, the total for Florida               
               slice(-1) %>%
#we select and change names for the counts of interest
               select("10.0 percent or less" = 5,
                      "10.0 to 14.9 percent" = 7,
                      "15.0 to 19.9 percent" = 9,
                      "20.0 to 24.9 percent" = 11,
                      "25.0 to 29.9 percent" = 13,
                      "30.0 to 34.9 percent" = 15,
                      "35.0 to 39.9 percent" = 17,
                      "40.0 to 49.9 percent" = 19,
                      "50.0 percent or more" = 21,
                      "Not computed" = 23)))

We export the data frame as “data_1_FL_takeitfurther_part1.csv”

write_csv(data_1_FL_takeitfurther_part1, "data_1_FL_takeitfurther_part1.csv")

The data frame has these columns,

data_1_FL_takeitfurther_part1 %>%
  colnames(.)
##  [1] "Census Tract"                                 
##  [2] "County"                                       
##  [3] "Housing Units Count"                          
##  [4] "Occupied"                                     
##  [5] "Owner occupied"                               
##  [6] "Renter occupied"                              
##  [7] "Vacant"                                       
##  [8] "For rent"                                     
##  [9] "Rented, not occupied"                         
## [10] "For sale only"                                
## [11] "Sold, not occupied"                           
## [12] "For seasonal, recreational, or occasional use"
## [13] "For migrant workers"                          
## [14] "Other Vacant"                                 
## [15] "$100 or less (contract)"                      
## [16] "$100 to $149 (contract)"                      
## [17] "$150 to $199 (contract)"                      
## [18] "$200 to $249 (contract)"                      
## [19] "$250 to $299 (contract)"                      
## [20] "$300 to $349 (contract)"                      
## [21] "$350 to $399 (contract)"                      
## [22] "$400 to $449 (contract)"                      
## [23] "$450 to $499 (contract)"                      
## [24] "$500 to $549 (contract)"                      
## [25] "$550 to $599 (contract)"                      
## [26] "$600 to $649 (contract)"                      
## [27] "$650 to $699 (contract)"                      
## [28] "$700 to $749 (contract)"                      
## [29] "$750 to $799 (contract)"                      
## [30] "$800 to $899 (contract)"                      
## [31] "$900 to $999 (contract)"                      
## [32] "$1,000 to $1,249 (contract)"                  
## [33] "$1,250 to $1,499 (contract)"                  
## [34] "$1,500 to $1,999 (contract)"                  
## [35] "$2,000 to $2,499 (contract)"                  
## [36] "$2,500 to $2,999 (contract)"                  
## [37] "$3,000 to $3,499 (contract)"                  
## [38] "$3,500 or more (contract)"                    
## [39] "No cash rent (contract)"                      
## [40] "$100 or less (gross)"                         
## [41] "$100 to $149 (gross)"                         
## [42] "$150 to $199 (gross)"                         
## [43] "$200 to $249 (gross)"                         
## [44] "$250 to $299 (gross)"                         
## [45] "$300 to $349 (gross)"                         
## [46] "$350 to $399 (gross)"                         
## [47] "$400 to $449 (gross)"                         
## [48] "$450 to $499 (gross)"                         
## [49] "$500 to $549 (gross)"                         
## [50] "$550 to $599 (gross)"                         
## [51] "$600 to $649 (gross)"                         
## [52] "$650 to $699 (gross)"                         
## [53] "$700 to $749 (gross)"                         
## [54] "$750 to $799 (gross)"                         
## [55] "$800 to $899 (gross)"                         
## [56] "$900 to $999 (gross)"                         
## [57] "$1,000 to $1,249 (gross)"                     
## [58] "$1,250 to $1,499 (gross)"                     
## [59] "$1,500 to $1,999 (gross)"                     
## [60] "$2,000 to $2,499 (gross)"                     
## [61] "$2,500 to $2,999 (gross)"                     
## [62] "$3,000 to $3,499 (gross)"                     
## [63] "$3,500 or more (gross)"                       
## [64] "No cash gross rent (gross)"                   
## [65] "10.0 percent or less"                         
## [66] "10.0 to 14.9 percent"                         
## [67] "15.0 to 19.9 percent"                         
## [68] "20.0 to 24.9 percent"                         
## [69] "25.0 to 29.9 percent"                         
## [70] "30.0 to 34.9 percent"                         
## [71] "35.0 to 39.9 percent"                         
## [72] "40.0 to 49.9 percent"                         
## [73] "50.0 percent or more"                         
## [74] "Not computed"

and 5160 rows, for a total of 5160 census tracts, with this distribution of census tracts per county:

data_1_FL_takeitfurther_part1 %>%
  count(County)

There are no negative

data_1_FL_takeitfurther_part1 %>%
#preserving the rows with negative values
  filter(if_any(everything(), ~ .x < 0)) %>%
#preserving the columns with negative values
  select(where(~ any(.x < 0)))

or missing values,

#obtaining the number of columns with missing values
rowSums(data_1_FL_takeitfurther_part1 %>%
  summarise(across(everything(), ~ any(is.na(.x)))))
## [1] 0

nor duplicated rows.

#checking if the number of distinct and original rows are the same
nrow(data_1_FL_takeitfurther_part1 %>%
       distinct()) == nrow(data_1_FL_takeitfurther_part1)
## [1] TRUE

- part 2

The procedure for part 2 was the same, just the tables were different:

  • B25009, Tenure by Household Size
  • B25042, Tenure by Bedrooms
  • B25068, Bedrooms by Gross Rent
data_1_FL_tenure_by_household_size <- read_csv("data_1_FL_tenure_by_household_size.csv") #(B25009)
data_1_FL_tenure_by_bedrooms <- read_csv("data_1_FL_tenure_by_bedrooms.csv") #(B25042)
data_1_FL_bedrooms_by_gross_rent <- read_csv("data_1_FL_bedrooms_by_gross_rent.csv") #(B25068)
#we save the merged data into data_1_FL_takeitfurther_part2
(data_1_FL_takeitfurther_part2 <- data_1_FL_tenure_by_household_size  %>%
#we remove the first line, the total for Florida
   slice(-1) %>%
#we clean the geoid column and create the County one
   mutate(geoid = str_sub(geoid, 8),
          County = str_extract(name,"(?<=,)[^,]+(?=,)"),
          County = str_trim(str_c(County, "County", sep = " "))) %>%
#we change name to geoid, select County and change names for household sizes counts 
   select("Census Tract" = geoid,
          last_col(),
          "1-person (owner)" = 7,
          "2-person (owner)" = 9,
          "3-person (owner)" = 11,
          "4-person (owner)" = 13,
          "5-person (owner)" = 15,
          "6-person (owner)" = 17,
          "7-or-more person (owner)" = 19,
          "1-person (renter)" = 23,
          "2-person (renter)" = 25,
          "3-person (renter)" = 27,
          "4-person (renter)" = 29,
          "5-person (renter)" = 31,
          "6-person (renter)" = 33,
          "7-or-more person (renter)" = 35) %>%
#we merge vertically through rows' indexes  
  bind_cols(data_1_FL_tenure_by_bedrooms  %>%
#we remove the first line, the total for Florida
              slice(-1) %>%
#we change names to the bedrooms counts 
              select("No bedroom (owner)" = 7,
                     "1 bedroom (owner)" = 9,
                     "2 bedrooms (owner)" = 11,
                     "3 bedrooms (owner)" = 13,
                     "4 bedrooms (owner)" = 15,
                     "5 or more bedrooms (owner)" = 17,
                     "No bedroom (renter)" = 21,
                     "1 bedroom (renter)" = 23,
                     "2 bedrooms (renter)" = 25,
                     "3 bedrooms (renter)" = 27,
                     "4 bedrooms (renter)" = 29,
                     "5 or more bedrooms (renter)" = 31)) %>%
#we merge vertically through rows' indexes  
  bind_cols(data_1_FL_bedrooms_by_gross_rent %>%
#we remove the first line, the total for Florida
              slice(-1) %>%
#we change names to the rent by bedrooms counts 
              select("Less than $300 (No bedroom)" = 9,
                     "$300 to $499 (No bedroom)" = 11,
                     "$500 to $749 (No bedroom)" = 13,
                     "$750 to $999 (No bedroom)" = 15,
                     "$1,000 to $1,499 (No bedroom)" = 17,
                     "$1,500 or more (No bedroom)" = 19,
                     "No cash rent (No bedroom)" = 21,
                     "Less than $300 (1 bedroom)" = 27,
                     "$300 to $499 (1 bedroom)" = 29,
                     "$500 to $749 (1 bedroom)" = 31,
                     "$750 to $999 (1 bedroom)" = 33,
                     "$1,000 to $1,499 (1 bedroom)" = 35,
                     "$1,500 or more (1 bedroom)" = 37,
                     "No cash rent (1 bedroom)" = 39,
                     "Less than $300 (2 bedrooms)" = 45,
                     "$300 to $499 (2 bedrooms)" = 47,
                     "$500 to $749 (2 bedrooms)" = 49,
                     "$750 to $999 (2 bedrooms)" = 51,
                     "$1,000 to $1,499 (2 bedrooms)" = 53,
                     "$1,500 or more (2 bedrooms)" = 55,
                     "No cash rent (2 bedrooms)" = 57,
                     "Less than $300 (3 or more bedrooms)" = 63,
                     "$300 to $499 (3 or more bedrooms)" = 65,
                     "$500 to $749 (3 or more bedrooms)" = 67,
                     "$750 to $999 (3 or more bedrooms)" = 69,
                     "$1,000 to $1,499 (3 or more bedrooms)" = 71,
                     "$1,500 or more (3 or more bedrooms)" = 73,
                     "No cash rent (3 or more bedrooms)" = 75)))
write_csv(data_1_FL_takeitfurther_part2, "data_1_FL_takeitfurther_part2.csv")

That produces a data frame with these columns,

data_1_FL_takeitfurther_part2 %>%
  colnames(.)
##  [1] "Census Tract"                         
##  [2] "County"                               
##  [3] "1-person (owner)"                     
##  [4] "2-person (owner)"                     
##  [5] "3-person (owner)"                     
##  [6] "4-person (owner)"                     
##  [7] "5-person (owner)"                     
##  [8] "6-person (owner)"                     
##  [9] "7-or-more person (owner)"             
## [10] "1-person (renter)"                    
## [11] "2-person (renter)"                    
## [12] "3-person (renter)"                    
## [13] "4-person (renter)"                    
## [14] "5-person (renter)"                    
## [15] "6-person (renter)"                    
## [16] "7-or-more person (renter)"            
## [17] "No bedroom (owner)"                   
## [18] "1 bedroom (owner)"                    
## [19] "2 bedrooms (owner)"                   
## [20] "3 bedrooms (owner)"                   
## [21] "4 bedrooms (owner)"                   
## [22] "5 or more bedrooms (owner)"           
## [23] "No bedroom (renter)"                  
## [24] "1 bedroom (renter)"                   
## [25] "2 bedrooms (renter)"                  
## [26] "3 bedrooms (renter)"                  
## [27] "4 bedrooms (renter)"                  
## [28] "5 or more bedrooms (renter)"          
## [29] "Less than $300 (No bedroom)"          
## [30] "$300 to $499 (No bedroom)"            
## [31] "$500 to $749 (No bedroom)"            
## [32] "$750 to $999 (No bedroom)"            
## [33] "$1,000 to $1,499 (No bedroom)"        
## [34] "$1,500 or more (No bedroom)"          
## [35] "No cash rent (No bedroom)"            
## [36] "Less than $300 (1 bedroom)"           
## [37] "$300 to $499 (1 bedroom)"             
## [38] "$500 to $749 (1 bedroom)"             
## [39] "$750 to $999 (1 bedroom)"             
## [40] "$1,000 to $1,499 (1 bedroom)"         
## [41] "$1,500 or more (1 bedroom)"           
## [42] "No cash rent (1 bedroom)"             
## [43] "Less than $300 (2 bedrooms)"          
## [44] "$300 to $499 (2 bedrooms)"            
## [45] "$500 to $749 (2 bedrooms)"            
## [46] "$750 to $999 (2 bedrooms)"            
## [47] "$1,000 to $1,499 (2 bedrooms)"        
## [48] "$1,500 or more (2 bedrooms)"          
## [49] "No cash rent (2 bedrooms)"            
## [50] "Less than $300 (3 or more bedrooms)"  
## [51] "$300 to $499 (3 or more bedrooms)"    
## [52] "$500 to $749 (3 or more bedrooms)"    
## [53] "$750 to $999 (3 or more bedrooms)"    
## [54] "$1,000 to $1,499 (3 or more bedrooms)"
## [55] "$1,500 or more (3 or more bedrooms)"  
## [56] "No cash rent (3 or more bedrooms)"

with 5160 rows, for a total of 5160 census tracts, with this distribution of census tracts per county:

data_1_FL_takeitfurther_part2 %>%
  count(County)

With again no negative

data_1_FL_takeitfurther_part2 %>%
#preserving the rows with negative values
  filter(if_any(everything(), ~ .x < 0)) %>%
#preserving the columns with negative values
  select(where(~ any(.x < 0)))

or missing values,

#obtaining the number of columns with missing values
rowSums(data_1_FL_takeitfurther_part2 %>%
  summarise(across(everything(), ~ any(is.na(.x)))))
## [1] 0

nor duplicated rows.

#checking if the number of distinct and original rows are the same
nrow(data_1_FL_takeitfurther_part2 %>%
       distinct()) == nrow(data_1_FL_takeitfurther_part2)
## [1] TRUE